# -*- coding: utf-8 -*-
import json,random
import scrapy


# Maps each feed channel slug (sent as the request's ``type``) to the
# classification id string that is attached to the items scraped from it.
cate_info = {
    "yule": "346",      # entertainment
    "junshi": "347",    # military
    "keji": "348",      # technology
    "qiche": "349",     # automobiles
    "lishi": "350",     # humanities (history)
    "guoji": "351",     # international
    "youxi": "352",     # games
    "qinggan": "353",   # relationships
    "jiaju": "354",     # home furnishing
    "caijing": "355",   # finance
    "xingzuo": "356",   # horoscope
    "shishang": "357",  # fashion
}

class EastDaySpider(scrapy.Spider):
    """Spider for the Eastday Toutiao (东方头条) per-category news feed.

    One form request is issued per entry in ``cate_info``; every usable
    article in the JSON answer is yielded as a plain ``dict`` item.
    """

    name = 'dftt'
    # Form fields posted with every feed request. The per-category ``type``
    # field is added on a copy in ``start_requests``; this dict is never
    # modified at runtime.
    # NOTE(review): the values look like a captured Android client profile —
    # confirm before changing any of them.
    data = {
        "newkey": "",
        "pgnum": "1",
        "idx": "48",
        "key": "45aefdd37d1f493e",
        "softtype": "TouTiao",
        "softname": "DFTTAndroid",
        "ime": "867252030721293",
        "appqid": "xiaomi181116",
        "apptypeid": "DFTT",
        "ver": "2.2.7",
        "os": "Android8.1.0",
        "ttaccid": "null",
        "appver": "020207",
        "deviceid": "7d7f08a2c1cebe0d",
        "position": "浙江",
        "iswifi": "wifi",
        "channellabel": "null",
        "citypos": "杭州",
        "sublocal": "萧山区",
        "hispos": "浙江,杭州",
        "ispack_s": "1",
        "sclog": "1",
        "devicetp": "0",
        "devicemode": "0"
    }

    def start_requests(self):
        """Yield one feed request per category listed in ``cate_info``.

        The category's classification id travels in ``meta["cate"]`` so that
        ``parse`` can stamp it on every item.
        """
        for key, cate in cate_info.items():
            # Build a per-request copy instead of writing "type" into the
            # class-level dict, which is shared by every spider instance.
            formdata = dict(self.data, type=key)
            yield scrapy.FormRequest(
                url='https://toutiao.eastday.com/toutiao_h5/RefreshJP?type={}&startkey=&lastkey=&tagskey=&readhistory=&idx=0&pgnum=1&newsnum=20'.format(key),
                formdata=formdata,
                callback=self.parse,
                meta={"cate": cate},
            )

    def parse(self, response):
        """Turn one feed response into article items.

        Entries are skipped when they have no URL, when the rewritten URL is
        not on ``mini.eastday.com``, or when they carry a ``video_link``.

        Yields:
            dict: with keys ``url``, ``title``, ``author``, ``publish_time``,
            ``comment_count``, ``browse_count``, ``site_classify_id`` and
            ``site_id``.
        """
        # The first five characters and the last one are dropped before JSON
        # decoding (presumably a JSONP-style callback wrapper — confirm
        # against a live response).
        payload = json.loads(response.text[5:-1])
        # A missing or null "data" field is treated as an empty feed rather
        # than failing on iteration.
        res_list = payload.get('data') or []
        cate = response.meta["cate"]
        for info in res_list:
            # Check for a missing URL *before* calling str methods on it.
            raw_url = info.get('url')
            if not raw_url:
                continue
            url = raw_url.replace('mobile', 'a')
            if 'mini.eastday.com' not in url:
                continue
            if info.get('video_link'):  # drop entries that contain a video
                continue

            # A missing source becomes an empty author instead of raising.
            author = (info.get('source') or '').replace('东方头条 ', '')

            # The feed's date gets ":00" appended; a missing date is
            # reported as None instead of raising.
            date = info.get('date')
            publish_time = date + ":00" if date is not None else None

            raw_comments = info.get('commentcnt')
            comment_count = int(raw_comments) if raw_comments else 0
            # NOTE: browse_count is always synthesized with random factors,
            # and comment_count is randomized too when the feed reports none.
            # The feed's 'urlpv' field is not used.
            if comment_count:
                browse_count = comment_count * random.randint(300, 500)
            else:
                comment_count = random.randint(0, 500)
                browse_count = comment_count * random.randint(100, 1000)

            yield {
                'url': url,
                'title': info.get('topic'),
                'author': author,
                'publish_time': publish_time,
                'comment_count': comment_count,
                'browse_count': browse_count,
                'site_classify_id': cate,
                'site_id': 110,  # 110 identifies Dongfang Toutiao (东方头条)
            }
